import pandas as pd
import numpy as np
import pickle
import json
# Visualisation libraries
# Maps
import folium
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we use a dataset containing Environmental Health Violations for Restaurants and Markets in Los Angeles County. This dataset can be found at [1] and [2].
Los Angeles County Environmental Health is responsible for checking food violations for all unincorporated areas and 85 of the 88 cities in the County. This dataset does not include Pasadena, Long Beach or Vernon (each has its own city health department).
# Load the pre-cleaned inspection/violation tables produced upstream.
Violations_df = pd.read_csv('LA/Clean_Violations_df.csv')
Inspections_df = pd.read_csv('LA/Clean_Inspections_df.csv')
Data = pd.read_csv('LA/Data_Inspections_Violations_df.csv')
Violations_matrix = pd.read_csv('LA/Violations_matrix_df.csv')
# Dictionaries
# NOTE(review): pickle.load executes arbitrary code from the file — fine for
# this locally produced artifact, but never use on untrusted input.
with open('LA/vDict.pkl', 'rb') as fp:
    Violation_Dictionary = pickle.load(fp)
# Normalize ZIP codes to their 5-digit prefix (drops any ZIP+4 suffix);
# the vectorized .str accessor replaces the original apply(lambda ...).
Data['Facility Zip'] = Data['Facility Zip'].astype(str).str[:5]
# Per-ZIP summary: mean inspection score and number of distinct facilities.
Facilities_Geographical_df = (
    Data.groupby('Facility Zip')
        .agg(Average_Score=('Score', 'mean'),
             Total_Facilities=('Facility ID', 'nunique'))
        .reset_index()
        .rename(columns={'Average_Score': 'Average Score',
                         'Total_Facilities': 'Total Facilities'})
        .dropna()
)
Facilities_Geographical_df.head().style.hide_index()
| Facility Zip | Average Score | Total Facilities |
|---|---|---|
| 90001 | 90.785648 | 269 |
| 90002 | 92.230228 | 70 |
| 90003 | 91.138814 | 257 |
| 90004 | 92.719656 | 258 |
| 90005 | 91.042819 | 272 |
Next, we can export a GeoJSON file from here. However, we would like to reduce the size of this file to keep the computation efficient.
# loading the GeoJSON file
with open('LA/LA_ZIP_Codes.geojson', 'r') as jsonFile:
    Zipcode_Data = json.load(jsonFile)
# ZIP codes present in our dataset
mylist = Facilities_Geographical_df['Facility Zip'].unique().tolist()
# Keep only features whose ZIP appears in the dataset; a set gives O(1)
# membership tests instead of a list scan per feature.
zips = set(mylist)
temp = [feature for feature in Zipcode_Data['features']
        if feature['properties']['name'] in zips]
# creating a new GeoJSON FeatureCollection
Reduced_Zipcode_Data = {'type': 'FeatureCollection', 'features': temp}
del Zipcode_Data, mylist, temp, zips
# save the JSON file — a context manager guarantees the handle is closed
# (the original open(...).write(...) leaked the file object).
with open("LA/reduced_LA_ZIP_Codes.json", "w") as outFile:
    json.dump(Reduced_Zipcode_Data, outFile,
              sort_keys=True, indent=4, separators=(',', ': '))
del Reduced_Zipcode_Data
def plot_map(Inp_Column, Text_Legend = '', Inp_Df = Facilities_Geographical_df, Zoom_Level=8):
    """Render a folium choropleth of *Inp_Column* per ZIP code.

    Parameters
    ----------
    Inp_Column : str
        Column of ``Inp_Df`` mapped to the fill color.
    Text_Legend : str
        Legend title shown on the map.
    Inp_Df : pandas.DataFrame
        Must contain a 'Facility Zip' column matching the GeoJSON feature names.
    Zoom_Level : int
        Initial folium zoom level.

    Returns
    -------
    folium.Map
    """
    # reading of the updated GeoJSON file
    Geographical_Data = r'LA/reduced_LA_ZIP_Codes.json'
    # initiating a Folium map centered on downtown Los Angeles
    m = folium.Map(location = [34.052, -118.243], zoom_start = Zoom_Level)
    # Map.choropleth() was deprecated and removed in folium >= 0.9;
    # the folium.Choropleth class is the supported replacement.
    folium.Choropleth(
        geo_data = Geographical_Data,
        data = Inp_Df,
        columns = ['Facility Zip', Inp_Column],
        key_on = 'feature.properties.name',
        fill_color = 'RdYlGn',
        fill_opacity = 0.8,
        line_opacity = 0.2,
        legend_name = Text_Legend,
    ).add_to(m)
    folium.LayerControl().add_to(m)
    # Show the map
    return m
# Choropleths per ZIP: average inspection score, then total facility count.
# (In a notebook each call was its own cell; only a cell's last expression renders.)
plot_map('Average Score', 'The Average Score of Facilities')
plot_map('Total Facilities','Total Facilities')
First, let's add the Violation Matrix to our Data.
Data= pd.merge(Data, Violations_matrix, on='Facility ID', how='right')
Moreover, we previously found the following violation codes to be the most correlated ones.
F035, W017, CV20, CV29, W021, W011, F033, F040, F044
Thus,
# Violation codes previously identified as most correlated with the score.
Most_Corr = ['F035', 'W017', 'CV20', 'CV29', 'W021', 'W011', 'F033', 'F040', 'F044']
# Mean violation count per facility, then averaged per ZIP code.
# .mean() replaces .agg({np.mean}): passing numpy callables to agg is
# deprecated in recent pandas, and the resulting single-level columns make
# the original droplevel(1) step unnecessary.
Facility_Violations_df = (
    Data.groupby(['Facility Zip', 'Facility ID'])[Most_Corr].mean()
        .groupby(level=0).mean()
        .reset_index()
)
display(Facility_Violations_df.head().style.hide_index())
| Facility Zip | F035 | W017 | CV20 | CV29 | W021 | W011 | F033 | F040 | F044 |
|---|---|---|---|---|---|---|---|---|---|
| 90001 | 1.602230 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.750929 | 0.940520 | 2.167286 |
| 90002 | 1.485714 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.714286 | 0.800000 | 1.800000 |
| 90003 | 2.007782 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.400778 | 1.077821 | 2.482490 |
| 90004 | 1.554264 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.414729 | 0.872093 | 1.686047 |
| 90005 | 1.511029 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.246324 | 1.238971 | 2.352941 |
# Plot one map per violation code whose first-row value is non-zero;
# mylist[1:] skips the leading 'Facility Zip' column.
temp = Facility_Violations_df[0:1].any().to_dict()
# 'if v' replaces the redundant 'v == True'; a list comprehension replaces
# the dict-comprehension-then-keys() construction.
mylist = [k for k, v in temp.items() if v]
for vCode in mylist[1:]:
    display(plot_map(vCode, 'Average Violation: %s' % Violation_Dictionary[vCode],
                     Facility_Violations_df))
# Inspection counts per ZIP code broken down by 'PE Seats' category.
# groupby().size().unstack() already yields a DataFrame, so the original
# pd.DataFrame() wrapper was redundant.
PE_Seats_Counts = Data.groupby(['Facility Zip', 'PE Seats']).size().unstack(1).reset_index()
PE_Seats_Counts.head().style.hide_index()
| Facility Zip | 0-10 SQ. FT. | 0-30 SQ. FT. | 0-999 SQ. FT. | 1-1999 SQ. FT. | 1-4999 SQ. FT. | 1000-1999 SQ. FT. | 10000+ SQ. FT. | 151+ SQ. FT. | 2000+ SQ. FT. | 2000-4999 SQ. FT. | 2000-5999 SQ. FT. | 31-60 SQ. FT. | 6000+ SQ. FT. | 61-150 | ANNUAL | Other |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 90001 | nan | 2432.000000 | nan | 1552.000000 | nan | nan | nan | 121.000000 | 449.000000 | nan | nan | 792.000000 | nan | 33.000000 | nan | nan |
| 90002 | nan | 469.000000 | nan | 442.000000 | nan | nan | nan | 18.000000 | 116.000000 | nan | nan | 73.000000 | nan | 20.000000 | nan | nan |
| 90003 | nan | 2531.000000 | nan | 2268.000000 | nan | nan | nan | nan | 200.000000 | nan | nan | 397.000000 | nan | 151.000000 | nan | nan |
| 90004 | nan | 1532.000000 | nan | 404.000000 | nan | nan | nan | 67.000000 | 343.000000 | nan | nan | 1225.000000 | nan | 831.000000 | nan | 14.000000 |
| 90005 | nan | 1859.000000 | nan | 524.000000 | nan | nan | nan | 389.000000 | 175.000000 | nan | nan | 1847.000000 | nan | 1278.000000 | nan | nan |
# Plot one map per PE-Seats category that is non-empty in the first row;
# mylist[1:] skips the leading 'Facility Zip' column.
temp = PE_Seats_Counts[0:1].any().to_dict()
mylist = [k for k, v in temp.items() if v]  # 'if v' replaces 'v == True'
for s in mylist[1:]:
    display(plot_map(s, 'Facility Total Count (%s)' % s, PE_Seats_Counts))
# Inspection counts per ZIP code broken down by facility type ('PE Type').
# groupby().size().unstack() already yields a DataFrame, so the original
# pd.DataFrame() wrapper was redundant.
PE_type_counts = Data.groupby(['Facility Zip', 'PE Type']).size().unstack(1).reset_index()
PE_type_counts.head().style.hide_index()
| Facility Zip | Caterer | Food Mkt Retail | Food Processing Wholesale | Food Vehicle Commissary | Food Warehouse | Interim Housing FF | LIC HTH Care Food FAC | Private School Cafeteria | Restaurant | Skc Tenant Retail Food Operator | Wholesale Food Complex |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 90001 | nan | 2001.000000 | nan | nan | nan | nan | nan | nan | 3378.000000 | nan | nan |
| 90002 | nan | 558.000000 | nan | nan | nan | nan | nan | nan | 580.000000 | nan | nan |
| 90003 | nan | 2468.000000 | nan | nan | nan | nan | nan | nan | 3079.000000 | nan | nan |
| 90004 | nan | 740.000000 | nan | nan | nan | 7.000000 | nan | 14.000000 | 3655.000000 | nan | nan |
| 90005 | nan | 699.000000 | nan | nan | nan | nan | nan | nan | 5373.000000 | nan | nan |
# Plot one map per facility type that is non-empty in the first row;
# mylist[1:] skips the leading 'Facility Zip' column.
temp = PE_type_counts[0:1].any().to_dict()
mylist = [k for k, v in temp.items() if v]  # 'if v' replaces 'v == True'
for s in mylist[1:]:
    display(plot_map(s, 'PE Type: %s' % s, PE_type_counts))
# Inspection counts per ZIP code broken down by risk category ('PE Risk').
# groupby().size().unstack() already yields a DataFrame, so the original
# pd.DataFrame() wrapper was redundant.
PE_Risk_Counts = Data.groupby(['Facility Zip', 'PE Risk']).size().unstack(1).reset_index()
PE_Risk_Counts.head().style.hide_index()
| Facility Zip | High Risk | Low Risk | Moderate Risk | None | Seats High Risk | Seats Low Risk | Seats Moderate Risk |
|---|---|---|---|---|---|---|---|
| 90001 | 896.000000 | 1011.000000 | 94.000000 | nan | 2499.000000 | 84.000000 | 795.000000 |
| 90002 | 243.000000 | 237.000000 | 78.000000 | nan | 338.000000 | 10.000000 | 232.000000 |
| 90003 | 667.000000 | 1651.000000 | 150.000000 | nan | 2056.000000 | 33.000000 | 990.000000 |
| 90004 | 350.000000 | 358.000000 | 32.000000 | 21.000000 | 2835.000000 | 63.000000 | 757.000000 |
| 90005 | 304.000000 | 366.000000 | 29.000000 | nan | 4465.000000 | 95.000000 | 813.000000 |
# Choropleths of per-ZIP inspection counts for each PE risk level.
# (In a notebook each call was its own cell; only a cell's last expression renders.)
plot_map('Low Risk', 'PE Risk: Low Risk' ,PE_Risk_Counts)
plot_map('Moderate Risk', 'PE Risk: Moderate Risk' ,PE_Risk_Counts)
plot_map('High Risk', 'PE Risk: High Risk' ,PE_Risk_Counts)